*******************************************************************************
* This file cleans the subsidy data that is collected by the author. 
* This includes calculating any non-discretionary subsidy, 
*******************************************************************************
*******************************************************************************
**** 1. Potential non-discretionary subsidy from state spending data: 
*******************************************************************************
u $datadir/raw/state_level_spending, clear 
egen job_credits = rowtotal(jobs_credits train_credits)
recode job_credit (0=.) 
keep year state job_credits zone_credits invest_credits 

statastates, abbrev(state) nogen 

sa $temp/credits, replace 

sort state year 

foreach v in job zone invest {
	preserve 
		drop if `v'_credits==.
		by state: egen min_year`v'=min(year)
		by state: egen max_year`v'=max(year)
		replace max_year=2017 if max_year==2016 // don't have data in 2017 but need to extrapolate to last year of subsidy data
		keep state min max 
		duplicates drop 
		sa $temp/year`v', replace 
	restore 
}

insheet using $datadir/raw/bds_size_state.csv, comma clear
keep if regexm(size, "m. ALL")

collapse (sum) job_creation_births, by(state year)

rename year year 
rename state state_fips 

merge 1:1 state_fips year using $temp/credits, keep(2 3)

foreach v in job zone invest { 
	g credit`v' = `v'_credits/job_creation_births 
}
drop if state=="DC" | state=="HI"

collapse (mean) credit*, by(state)

summ creditinvest 
summ creditjob 
summ creditzone 

foreach v in invest job zone {
	merge 1:1 state using $temp/year`v', nogen keep(1 3)
}

sa $temp/tax_perjob_all, replace 

*******************************************************************************
**** 2. Prep for merge 
*******************************************************************************
import excel using "$datadir/raw/mega_threat_clean.xlsx", firstrow clear
replace state=strtrim(state)
replace county=strtrim(county)

egen jobs_total=rowtotal(jobs_direct jobs_retain)

*get state fips, full name
statastates, abbrev(state)
keep if _m==3
drop _m 

rename state_fips statefip
split county, parse(",")
g countyname=strtrim(county1) 
drop county1 county2 county3
replace countyname = lower(countyname)
replace countyname = subinstr(countyname, "st. ", "st ", .)

order state state_name statefip county countyname year naics4
replace state_name = proper(state_name)

*******************************************************************************
*3. adjust for other tax credits/incentives [hand collected by author]
*******************************************************************************
****R&D TAX CREDIT 
//need r&d tax credit rate and r&d spend per industry [this file is from wilson/moretti and NSF on R&D emp/spend by industry]
merge m:1 year state_name naics4 using $datadir/raw/state_naics_RD, keep(1 3) ///
	keepusing(rd_credit_level rd_emp) nogen

//need industry average wages [created in make_ind.do]
merge m:1 countyname year naics4 statefip using $datadir/wage_naics_rect, keep(1 3) nogen

//Assumption: All employee wages and 20% of investment is eligible for r&d tax credit 
g av_wage = wage_county
replace av_wage = wage_cz if av_wage==.
adjust_inflation av_wage wage_county wage_cz, year(2017) 

//creating amount of r&d credit, in millions:
g RDcredit=rd_credit_level*invest_M/5 + rd_credit_level*av_wage*jobs_direct/1000 ///
	if other_inc==1 & rd_emp>.1
recode RDcredit (.=0)
	
//Incentives besides R&D tax credit are included for CA and NC, so do not need to calculate in next section
replace other_incentives=0 if inlist(state, "CA", "NC")

****JOBS, EZ, INVESTMENT TAX CREDITS
//here credits are often formatted as $ per job. 
//Therefore, use tax credit expenditure reports to track spending, and normalize by jobs created [collected by author]
merge m:1 state using $temp/tax_perjob_all, keep(1 3) nogen 
recode credit* (.=0)

//Assumptions to apply tax credit expenditures to subsidized firms:
*1. 5 years of credit per job
*2. Only new jobs for job/EZ credits
*3. Any jobs for investment
*4. Minimum investment for investment credits $200M
g pot_jobs_credit = 5*jobs_direct*creditjob if inrange(year, min_yearjob, max_yearjob)
g pot_invest_credit = 5*jobs_total*creditinvest if inrange(year, min_yearinvest, max_yearinvest) & invest>200
g pot_zone_credit =  5*jobs_direct*creditzone if inrange(year, min_yearzone, max_yearzone)
recode pot_* (.=0)

g additional_credit = other_inc*(pot_jobs + pot_invest + pot_zone)/1000000 

rename sub_M orig_sub_M
//scale up the subsidies with this other potential credits 
g sub_M = orig_sub_M + additional_credit + RDcredit 

drop rd_credit rd_emp other_incentives RDcredit - additional_credit fips
order firm state state_name statefip county countyname fipscty year sub_M orig_sub naics4 invest jobs*
replace firm=strtrim(firm)

//put everything in 2017 dollars 
adjust_inflation sub_M invest_M, year(2017)

****************************************************************************************
* 4 more data cleaning
************************************************************************************
*****clean up number of bidders
destring n_bidders, gen(n_bid) force ignore("+")
rename n_bidders text_n_bid
replace n_bid= 5 if n_bid==. //use average number of bidders if missing

label var firm "firm name"
label var fipscty "county fips code"
label var countyname "county full name"
label var sub_M "subsidy size ($ M)"
label var orig_sub_M "discretionary subsidy reported ($ M)"
label var invest_M "investment planned ($ M)"
label var jobs_direct "number of new jobs promised at firm"
label var jobs_retain "number of jobs retained by firm"
label var jobs_total "total jobs (direct + retained)"
label var threat_clean "threat states"
label var threat_county "threat counties"
label var v_2 "runner-up subsidy offer ($ M)"
label var win_present "is firm already present in subsidized locale?"
label var threat_present1 "firm present in runner-up (1st runner-up)"
label var threat_present2 "firm present in runner-up (2nd runner-up)"
label var threat_present3 "firm present in runner-up (3rd runner-up)"
label var threat_present4 "firm present in runner-up (4th runner-up)"
label var threat_present5 "firm present in runner-up (5th runner-up)"
label var n_bid "number of bidders for analysis"
label var text_n_bid "number of bidders from appendix"
label var wage_county "average county level wages ($ 1000)"
label var wage_cz "average commuting zone level wages ($ 1000)"
label var av_wage "average county wages (use cz when county missing)($ 1000)"

egen unique_id = group(firm fipscty countyname sub_M orig_sub_M invest_M jobs_direct)
	
****************************************************************************************
* 5 reshape 
************************************************************************************
//going to reshape so that is runner-up/winner pairs (relevant for observations with multiple runner-ups)
	rename threat_clean ru_state
	rename threat_county ru_county
	split ru_state, parse(",") 
	split ru_county, parse(",") 
		
	g id=_n 
	drop ru_state ru_county
	reshape long ru_county ru_state threat_present, i(year state county sub_M jobs* id win_present n_bid)
	drop if ru_state==""
	drop _j
		
	replace ru_state=strtrim(ru_state)
	replace ru_county=strtrim(ru_county)

//runner-up offers:	
	destring v_2, force gen(ru_offer)
	replace ru_offer=100 if v_2=="100 (GA)" & ru_state=="GA"
	replace ru_offer=7 if v_2=="7 (OH)" & ru_state=="OH"
	replace ru_offer=800 if v_2=="400 (similar?)"
	replace ru_offer=90 if v_2=="90 (state)"
	replace ru_offer=56.9 if regexm(v_2, "56.9") & ru_state=="NY"
	replace ru_offer=5.6 if regexm(v_2, "56.9") & ru_state=="AZ"
	replace ru_offer=9.8 if regexm(v_2, "56.9") & ru_state=="TX"
	drop v_2
	
	adjust_inflation ru_offer, year(2017)
	
	recode *_present (.=0)
	g diff_present=(threat_present - win_present)
	
//sectors:
	g naics3=floor(naics4/10)
	g naics2=floor(naics3/10)
	replace naics2=31 if inlist(naics2,32,33)
	replace naics2=44 if naics2==45
	replace naics2=48 if naics2==49
	
//clean up runner-up county name:
g countyname_ru=strtrim(ru_county)
replace countyname_ru = lower(countyname_ru)
replace countyname_ru = subinstr(countyname_ru, "st. ", "st ", .)

//industry multiplier variables [from EPI]
merge m:1 naics4 using "$datadir/raw/naics_multiplier", keep(1 3) nogen 
	
sa $datadir/subsidy_data, replace 
	